Prime vs TIRE-seq compare metrics

Recap

I sequenced these samples with Prime-seq on extracted RNAs already. Results were not that great. Here reprocess the same samples with TIRE-seq for a head to head comparison.

Samples

  • Yasmin Nouri plates.
  • All wells have 10,000 cells in 50ul total volume.
  • Comprised of 25ul cells in media and 25ul 2x buffer TCL
  • The cell concentration is therefore 200 cells/uL.
  • So 20uL input will be 4000 cells.

Lab notes

The processing went as planned. Full writeup available at ELN link

Downsampling notes

This was doone in the notebook scripts /S000553/downsample_humanTcell.sh

Prime-seq = MB2_PRM0003_YN sequenced on 2 runs:
* S000514 40M * S000493 149.8M * Combined = 189.8M / 192 samples = 988k reads per well * The object I import was generated in G000233_DRAC_notebooks/MB2_Tcell/1A_generate_SCE.rmd

TIRE-Seq

  • S000553 160.0M / 96 samples = 1.66M reads per well
  • Therefore need to downsample this to 988k * 96 = 94,848,000 reads for the library

Read SCEs

This is generated in 1A notebook generate SCE.

tire <- readRDS(here::here(
  "data/TIRE_Tcell/SCEs/Downsample/tire_tcell_ds_basic.sce.rds"
))

prime <- readRDS(here::here(
  "data/TIRE_Tcell/SCEs/Downsample/prime_tcell_ds_basic.sce.rds"
))

Extract the sample metadata and unify before merging into a tibble.
Keep only the untransduced normal T cells in the Prime-seq experiment.

tire_tb <- as_tibble(colData(tire))
tire_tb$Protocol <- "TIRE"

prime_tb <- as_tibble(colData(prime))
prime_tb$Protocol <- "Prime"

keep_cols <- intersect(
  colnames(tire_tb),
  colnames(prime_tb)
)

tb <- rbind(
  tire_tb[,keep_cols],
  prime_tb[,keep_cols]
)

Quality control metrics

All metrics are better for TIRE-seq

Reads and UMIs

Better for TIRE-seq

plt1 <- ggplot(data=tb, aes(y=sum+1, x=Reads+1, colour=Protocol)) +
  geom_point() +
  scale_colour_brewer(type="qualitative", palette = "Dark2") +
  xlab("Reads") + ylab("Library size (UMIs)") +
  scale_y_continuous(trans='log10', limits = c(1, 1e7)) +
  scale_x_continuous(trans='log10', limits = c(1, 1e7)) +
  annotation_logticks(base = 10, sides = "bl") +
  coord_fixed(ratio = 1)

plt1

UMIs per timepoint

This show the activation of transcription at day 1 and 2

tb <- tb %>% 
  mutate(Day = as.numeric(str_split(pattern = "_", Timepoint, simplify = TRUE)[,2]))

plt5 <- ggplot(data = tb, aes(y = sum + 1, x = Day, colour = Protocol)) +
  geom_point() +
  geom_smooth(se = TRUE, method = "loess", span = 0.6) + # Adjust `span` for better fit
  scale_colour_brewer(type = "qualitative", palette = "Dark2") +
  xlab("Day") + 
  ylab("Library size (UMIs)") +
  scale_y_continuous(trans = 'log10', limits = c(1e3, 1e7)) +
  annotation_logticks(base = 10, sides = "l")

plt5

Mapping stats

Better for TIRE-seq.

mapping <- tb %>% 
  select(Intergenic:Reads, Protocol,Well) %>% 
  pivot_longer(cols = c(Intergenic, Exon, Ambiguity, Unmapped),
               names_to = "Feature",
               values_to = "Count") %>%
  mutate(Percentage = Count / Reads * 100)

plt2 <- ggplot(mapping,
             aes(x = Feature, y= Percentage, colour = Protocol)) + 
  geom_boxplot() +
  ylab("Percent") + ylim(0,100) +
  xlab("") +
  scale_colour_brewer(palette = "Dark2")

plt2

Mitochondrial gene percent

Very low which is good.

plt3 <- ggplot(tb,
             aes(x = Protocol, y= subsets_Mito_percent, colour = Protocol)) + 
  geom_boxplot() + geom_jitter() +
  ylab("Mitochondrial %") + 
  xlab("") +
  scale_colour_brewer(palette = "Dark2")

plt3

Genes detected

Very low which is good.

plt4 <- ggplot(tb,
             aes(x = Protocol, y= detected, colour = Protocol)) + 
  geom_boxplot() + geom_jitter() +
  ylab("Genes detected") + 
  xlab("") +
  scale_colour_brewer(palette = "Dark2")

plt4

Gene correlation

rowData(tire)$sum <- rowSums2(counts(tire))
rowData(tire)$protocol <- "TIRE"

rowData(prime)$sum <- rowSums2(counts(prime))
rowData(prime)$protocol <- "Prime"

gene_tb <- as_tibble(rbind(
  rowData(tire),
  rowData(prime)
))

# Add 1 before logging
gene_tb$sum <- gene_tb$sum + 1

Convert to wide tibble

gene_tb_wide <- gene_tb %>%
  select(ID,Symbol,sum,protocol) %>% 
  pivot_wider(names_from = protocol, values_from = sum)

gene_tb_wide
gene_tb_wide_clean <- na.omit(gene_tb_wide)
correlation <- cor(log10(gene_tb_wide_clean$Prime), log10(gene_tb_wide_clean$TIRE), method = "pearson")

Plot the result of gene gene correlation

The correlation is 0.8866417

plt4 <- ggplot(gene_tb_wide_clean, 
             aes(x = Prime, y= TIRE)) + 
  geom_point(alpha = 0.33, size=1) + 
  guides(colour = guide_legend(override.aes = list(size=3, alpha=1))) +
  xlab("Prime-seq") + 
  ylab("TIRE-seq") +
  ggtitle("Counts by gene") +
  geom_smooth(method = "lm", formula = y ~ x, color = "red", 
              se = TRUE, level = 0.95) +
  scale_y_continuous(trans='log10', limits = c(1, 1e6)) +
  scale_x_continuous(trans='log10', limits = c(1, 1e6)) +
  annotation_logticks(base = 10, sides = "bl") +
  theme_Publication(base_size = 18)

plt4

Differential expression testing

Check what genes are different between Prime and TIRE-seq particularly some outliers detected in Prime-seq.

First need to combine the objects.

keep_genes <- intersect(
  row.names(tire),
  rownames(prime)
)

counts <- cbind(
  counts(tire[keep_genes]),
  counts(prime[keep_genes])
)

dge <- DGEList(counts = counts, 
               group = tb$Protocol)

Perform preprocessing steps

keep <- filterByExpr(dge, group=dge$samples$Timepoint, min.count=1)
dge <- dge[keep,]
dge <- calcNormFactors(dge)
design <- model.matrix(~group, data=dge$samples)
dge <- estimateDisp(dge, design)
fit <- glmQLFit(dge, design)

Perform the DE test

qlf <- glmQLFTest(fit, coef = 2)
results <- as_tibble(topTags(qlf, n = Inf)$table)
results$Symbol <- row.names(topTags(qlf, n = Inf)$table)
sig_genes <- results %>%
  filter(FDR < 0.05, abs(logFC) > 1)

sig_genes

Create the MA plot

ggplot(results, aes(x = logCPM, y = logFC)) +
  geom_point(aes(color = FDR < 0.05), alpha = 0.3, size=0.75) +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  scale_color_manual(values = c("grey", "darkblue")) +
  labs(
    x = "Log2 CPM",
    y = "Log2 FC",
    color = "Significant"
  ) +
  theme_Publication()

Plot the technical variation

I generated this with ChaptGPT io

I could not use edgeR BCV function bacause Prime-seq had too few counts to pass the filter by expression step.

# Start with unfiltered dgelist object
dge <- DGEList(counts = counts, 
               group = tb$Protocol)
# Calculate normalized counts (log-transformed counts per million)
logCPM <- edgeR::cpm(dge, log=TRUE, normalized.lib.sizes=TRUE)
# Convert to a data frame
logCPM_df <- as.data.frame(logCPM)
logCPM_df$Gene <- rownames(logCPM_df)

# Reshape the data to long format
library(reshape2)
logCPM_long <- melt(logCPM_df, id.vars = "Gene", variable.name = "Sample", value.name = "Expression")

# Add group information to the data
logCPM_long$Group <- dge$samples$group[match(logCPM_long$Sample, rownames(dge$samples))]

# Calculate mean expression per gene within each group
gene_stats <- logCPM_long %>%
  group_by(Gene, Group) %>%
  summarize(
    Variance = var(Expression, na.rm = TRUE),
    MeanExpression = mean(Expression, na.rm = TRUE)
  )

Plot variance vs. mean expression

The shape here is typical of unfiltered counts which is what I supplied to the plot.
If I filter counts I lose too many genes from Prime-seq.

ggplot(gene_stats, aes(x=MeanExpression, y=Variance, color=Group)) +
  geom_point(alpha=0.5) +
  theme_Publication() +
  scale_colour_brewer(palette = "Dark2") +
  xlab("Mean Genewise log-CPM") + ylab("Variance") + labs(color="Protocol")

Conclusion

TIRE-seq better in all metrics

Session info

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Red Hat Enterprise Linux 9.4 (Plow)
## 
## Matrix products: default
## BLAS:   /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRblas.so 
## LAPACK: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRlapack.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] reshape2_1.4.4              ggthemes_5.1.0             
##  [3] here_1.0.1                  scater_1.32.1              
##  [5] scuttle_1.14.0              SingleCellExperiment_1.26.0
##  [7] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [9] GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
## [11] IRanges_2.38.1              S4Vectors_0.42.1           
## [13] BiocGenerics_0.50.0         MatrixGenerics_1.16.0      
## [15] matrixStats_1.4.1           edgeR_4.2.2                
## [17] limma_3.60.6                patchwork_1.3.0            
## [19] platetools_0.1.7            lubridate_1.9.3            
## [21] forcats_1.0.0               stringr_1.5.1              
## [23] dplyr_1.1.4                 purrr_1.0.2                
## [25] readr_2.1.5                 tidyr_1.3.1                
## [27] tibble_3.2.1                ggplot2_3.5.1              
## [29] tidyverse_2.0.0            
## 
## loaded via a namespace (and not attached):
##  [1] gridExtra_2.3             rlang_1.1.4              
##  [3] magrittr_2.0.3            compiler_4.4.1           
##  [5] mgcv_1.9-1                DelayedMatrixStats_1.26.0
##  [7] vctrs_0.6.5               pkgconfig_2.0.3          
##  [9] crayon_1.5.3              fastmap_1.2.0            
## [11] XVector_0.44.0            labeling_0.4.3           
## [13] utf8_1.2.4                rmarkdown_2.28           
## [15] tzdb_0.4.0                UCSC.utils_1.0.0         
## [17] ggbeeswarm_0.7.2          xfun_0.48                
## [19] zlibbioc_1.50.0           cachem_1.1.0             
## [21] beachmat_2.20.0           jsonlite_1.8.9           
## [23] highr_0.11                DelayedArray_0.30.1      
## [25] BiocParallel_1.38.0       irlba_2.3.5.1            
## [27] parallel_4.4.1            R6_2.5.1                 
## [29] bslib_0.8.0               stringi_1.8.4            
## [31] RColorBrewer_1.1-3        jquerylib_0.1.4          
## [33] Rcpp_1.0.13               knitr_1.48               
## [35] splines_4.4.1             Matrix_1.7-0             
## [37] timechange_0.3.0          tidyselect_1.2.1         
## [39] rstudioapi_0.17.0         abind_1.4-8              
## [41] yaml_2.3.10               viridis_0.6.5            
## [43] codetools_0.2-20          plyr_1.8.9               
## [45] lattice_0.22-6            withr_3.0.1              
## [47] evaluate_1.0.1            pillar_1.9.0             
## [49] generics_0.1.3            rprojroot_2.0.4          
## [51] hms_1.1.3                 sparseMatrixStats_1.16.0 
## [53] munsell_0.5.1             scales_1.3.0             
## [55] glue_1.8.0                tools_4.4.1              
## [57] BiocNeighbors_1.22.0      ScaledMatrix_1.12.0      
## [59] locfit_1.5-9.10           colorspace_2.1-1         
## [61] nlme_3.1-164              GenomeInfoDbData_1.2.12  
## [63] beeswarm_0.4.0            BiocSingular_1.20.0      
## [65] vipor_0.4.7               cli_3.6.3                
## [67] rsvd_1.0.5                fansi_1.0.6              
## [69] S4Arrays_1.4.1            viridisLite_0.4.2        
## [71] gtable_0.3.5              sass_0.4.9               
## [73] digest_0.6.37             SparseArray_1.4.8        
## [75] ggrepel_0.9.6             farver_2.1.2             
## [77] htmltools_0.5.8.1         lifecycle_1.0.4          
## [79] httr_1.4.7                statmod_1.5.0